Load library and path

Read raw .dcc and LabWorksheets

GeomxTools::NanoStringGeoMxSet object conversion to SpatialExperiment

disease = "lung"
spe <- readRDS(file.path(read_path, paste0(disease, ".rds")))
# spe <- readRDS(file.path(save_path, paste0(disease, "_qcd.rds"))) 

Normalization

## scran Normalization
library(scran)
spe <- computeSumFactors(spe, assay.type="counts")
data_norm_pre <- sweep(assays(spe)$counts, 2, spe$sizeFactor,'/')
assays(spe, withDimnames=FALSE)$scrannormcounts <- log(data_norm_pre + 1)

## logNormCounts - mean count normalization
assays(spe, withDimnames=FALSE)$meannormcounts <- assay(logNormCounts(spe, assay.type = "counts"), "logcounts")

## TMM
spe_tmm <- geomxNorm(spe, method = "TMM", log = FALSE)
assays(spe, withDimnames=FALSE)$tmmcounts <- assay(spe_tmm, "logcounts")

## Upperquartile (Q3)
spe_upperquatile <- geomxNorm(spe, method = "upperquartile", log = FALSE)
assays(spe, withDimnames=FALSE)$upperquartile <- assay(spe_upperquatile, "logcounts")

## Quantile
assays(spe, withDimnames=FALSE)$quantile <- normalize.quantiles(assay(spe, "log1p")) # preprocessCore::normalize.quantile on top of log1p

## scTransform
assay(spe, "scTransform", withDimnames = FALSE) <- sctransform::vst(counts(spe), min_cells = 0)$y

RLE for two normalization methods. One replicate of L3 (L3_3) and L4 (L4_3) were removed by gene detection rate QC. All low signal AOIs were removed.

plotRLExpr(spe, assay = "counts", color = slide_name) + ggtitle("Raw") + theme(legend.position = "none") |
plotRLExpr(spe, assay = "logcounts", color = slide_name) + ggtitle("logCPM") + theme(legend.position = "none") |
  plotRLExpr(spe, assay = "quantile", color = slide_name) + ggtitle("Quantile")

Check by sample, for breast are pretty well aligned.

plotRLExpr(spe, assay = "counts", color = sample_ids) + ggtitle("Raw") + theme(legend.position = "none") |
plotRLExpr(spe, assay = "logcounts", color = sample_ids) + ggtitle("logCPM") + theme(legend.position = "none") |
  plotRLExpr(spe, assay = "quantile", color = sample_ids) + ggtitle("Quantile")

PCA before batch correction

For breast samples, we can see that even if quantile normalization scales the relative expression onto the same scale, there are still a bit of batch effect to be corrected. Admittedly, the slides are pretty well aligned even before batch correction.

assay = 8
which.assay = "quantile"

set.seed(100)
spe <- scater::runPCA(spe, assay.type = which.assay)
set.seed(100)
spe <- scater::runUMAP(spe, dimred = "PCA")
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'

I propose to update manuscript Figure 1c to these three factors, slide (batch effect), segment (biology of interest), and sample (to show within patient replicates agreement after normalization). Maybe the change of slide and segment will not make it into the manuscript figure, but I keep them here for examining the effect of batch corrections. Also keep the GeoMx and Visium aesthetic in sync.

plotDimRed(spe, type = "PCA", annotate = "section_id", pt.size = 1.5) + 
  theme(legend.position = "bottom",
        legend.title=element_blank(),
        axis.text = element_blank(),
        axis.title = element_blank()) + 
  ggtitle("Lung")

# p1 <- plotDR(spe, dimred = "PCA", col = Section_ID) + xlab("") + ylab("") + 
#   theme(legend.position = "bottom", 
#         legend.title=element_blank(), 
#         axis.text.x = element_blank(),
#         axis.text.y = element_blank(),
#         # axis.ticks = element_blank(),
#         panel.border = element_blank(),
#         # plot.title = element_text(hjust = 0.5),
#         axis.line.x = element_line(size = 0.5, linetype = "solid", colour = "black"),
#         axis.line.y = element_line(size = 0.5, linetype = "solid", colour = "black")) + 
#   ggtitle("Breast") 
# p1
p2 <- plotDR(spe, dimred = "PCA", col = slide_name) + xlab("PC_1") + ylab("PC_2") + 
  theme(legend.position = "right", legend.title=element_blank(), plot.title = element_text(hjust = 0.5)) + ggtitle("Slide")
p3 <- plotDR(spe, dimred = "PCA", col = cell_fraction) + xlab("PC_1") + ylab("PC_2") + 
  theme(legend.position = "right", legend.title=element_blank(), plot.title = element_text(hjust = 0.5)) + ggtitle("Segment")

p2 | p3

UMAP before batch correction

For breast

plotDR(spe, dimred = "UMAP", col = slide_name) | plotDR(spe, dimred = "UMAP", col = cell_fraction)

Batch correction

For lung, RUV k = 9 is enough.

# Batch correction --------------------------------------------------------
spe <- findNCGs(spe, batch_name = "slide_name", top_n = 300)
## New names:
## • `cv` -> `cv...1`
## • `cv` -> `cv...2`
## • `cv` -> `cv...3`
metadata(spe) |> names()
##  [1] "Panel"             "PKCFileName"       "PKCModule"        
##  [4] "PKCFileVersion"    "PKCFileDate"       "AnalyteType"      
##  [7] "MinArea"           "MinNuclei"         "shiftedByOne"     
## [10] "shiftedByOneLogic" "sequencingMetrics" "QCMetrics"        
## [13] "lcpm_threshold"    "genes_rm_rawCount" "genes_rm_logCPM"  
## [16] "NCGs"
## Max biology cluster distinction, and minimize batch distinction
for(i in seq(9)){
  spe_ruv <- geomxBatchCorrection(spe, factors = "cell_fraction", n_assay = assay,
                                  NCGs = metadata(spe)$NCGs, k = i)
  
  print(plotPairPCA(spe_ruv, assay = "logcounts", n_dimension = 4, color = cell_fraction, title = paste0("k = ", i)))
  
}

PCA after batch correction.

Now slides are perfectly mixed in PC1 and PC2.

spe_ruv <- geomxBatchCorrection(spe, factors = "cell_fraction", n_assay = assay,
                                NCGs = metadata(spe)$NCGs, k = 9)
set.seed(100)
spe_ruv <- scater::runPCA(spe_ruv, assay.type = "logcounts")

pca_results_ruv <- reducedDim(spe_ruv, "PCA")

plotPairPCA(spe_ruv, assay = "logcounts", precomputed = pca_results_ruv, 
            color = cell_fraction, title = "RUV4, k = 9", n_dimension = 4)

plotPairPCA(spe_ruv, assay = "logcounts", precomputed = pca_results_ruv, 
            color = slide_name, title = "RUV4, k = 9", n_dimension = 4)

In UMAP, biology well separated, and batch effect well mixed.

## UMAP after batch correction
set.seed(100)
spe_ruv <- scater::runUMAP(spe_ruv, dimred = "PCA")
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
plotDR(spe_ruv, dimred = "UMAP", col = cell_fraction) |
  plotDR(spe_ruv, dimred = "UMAP", col = slide_name)

After batch correction, more agreement between samples. But for manuscript, show the after norm but before batch PCA. This is PCA after batched.

plotDimRed(spe_ruv, type = "PCA", annotate = "section_id", pt.size = 1.5) + 
  theme(legend.position = "bottom",
        legend.title=element_blank(),
        axis.text = element_blank(),
        axis.title = element_blank()) + 
  ggtitle("Lung - PCA")

After batch correction, more agreement between samples. But for manuscript, show the after norm but before batch PCA.

plotDimRed(spe_ruv, type = "UMAP", annotate = "section_id", pt.size = 1.5) + 
  theme(legend.position = "bottom",
        legend.title=element_blank(),
        axis.text = element_blank(),
        axis.title = element_blank()) + 
  ggtitle("Lung - UMAP")

####################################################################
#                           Deconvolution                          #
####################################################################
# NegProbe name
negProbeName <- rownames(spe)[which(grepl("Neg", rownames(spe)))]

# ## Normed
spd_normed <- prepareSpatialDecon(spe, assay2use = "quantile", negProbeName = negProbeName)

## Normed Batched
# force the normed, batch corrected count matrix to be not a df, but just matrix
assay(spe_ruv, "logcounts") <- as.matrix(assay(spe_ruv, "logcounts"))
spd_normed_batched <- prepareSpatialDecon(spe_ruv, assay2use = "logcounts", negProbeName = negProbeName)

# names(spd_normed)
# names(spd_normed_batched)

Load signature matrix derived from most differentially expressed genes from Chromium.

# Signature matrix
new_ref_matrix <- as.matrix(read.csv(paste0(absolute_path_cur, "Owkin_Pilot_Intermediate/GeoMx/Signature_matrices_CHUV_lung_breast.csv"), 
                                     row.names = 1))

Run deconvolution with Chromium annotated reference matrix

library(SpatialDecon)
res_norm <- spatialdecon(norm = spd_normed$normCount,
                               bg = spd_normed$backGround,
                               X = new_ref_matrix,
                               # cellmerges = safeTME.matches,
                               align_genes = TRUE)
# samples_subset_Malig <- colnames(spe)[spe$Cell_fraction %in%  c("Malignant")]
# subset_prop_Malig <- res_norm$prop_of_all[, samples_subset_Malig] # 18/28 cell types * 63 AOIs of Malig

subset_prop <- res_norm$prop_of_all

Make a deconvolution boxplot

df <- data.frame(t(subset_prop))

df$sample <- rownames(df)

library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
gathered_df <- gather(df, key = "column", value = "value", -sample)

CD <- as.data.frame(colData(spe)[, c("sample_id2", "section_id", "cell_fraction")])
CD <- CD %>% rename(sample = sample_id2)

library(dplyr)
gathered_df <- gathered_df %>%
  left_join(CD, by = "sample") %>%
  rename(CellType = column,
         Fraction = value)

# Plot
p <- ggplot(gathered_df, aes(x=CellType, y=Fraction)) +
  geom_boxplot() +
  geom_jitter(aes(color=section_id), size=1, alpha=0.9) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  facet_wrap(~cell_fraction, ncol = 5) + 
  ggtitle("Quantile Normed Expr Decon Result")
p

library(SpatialDecon)
res_norm_batch <- spatialdecon(norm = spd_normed_batched$normCount,
                               bg = spd_normed_batched$backGround,
                               X = new_ref_matrix,
                               # cellmerges = safeTME.matches,
                               align_genes = TRUE)

# samples_subset_Malig <- colnames(spe_ruv)[spe_ruv$Cell_fraction %in%  c("Malignant")]
# subset_prop_Malig <- res_norm_batch$prop_of_all[, samples_subset_Malig] # 18/28 cell types * 63 AOIs of Malig

subset_prop <- res_norm_batch$prop_of_all

Make deconvolution boxplot after normalization and batch correction.

df <- data.frame(t(subset_prop))

df$sample <- rownames(df)

library(tidyr)
gathered_df <- gather(df, key = "column", value = "value", -sample)

CD <- as.data.frame(colData(spe)[, c("sample_id2", "section_id", "cell_fraction")])
CD <- CD %>% rename(sample = sample_id2)

library(dplyr)
gathered_df <- gathered_df %>%
  left_join(CD, by = "sample") %>%
  rename(CellType = column,
         Fraction = value)

# Plot
p <- ggplot(gathered_df, aes(x=CellType, y=Fraction)) +
  geom_boxplot() +
  geom_jitter(aes(color=section_id), size=1, alpha=0.9) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  facet_wrap(~cell_fraction, ncol = 5) + 
  ggtitle("Quantile Normed Batch Corrected Expr Decon Result")
p